First, let’s check if “DESeq2” package is installed. And then, load the package.
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DESeq2)
library(ggplot2)
The computational analysis of an RNA-seq experiment begins from the FASTQ files that contain the nucleotide sequence of each read and a quality score at each position. These reads must first be aligned to a reference genome or transcriptome. The output of this alignment step is commonly stored in a file format called SAM/BAM. This is the workflow we followed last day.
Lets perform some exploratory differential gene expression analysis. Note: this analysis is for demonstration only. NEVER do differential expression analysis this way!
First, load the data.
counts = read.csv("data/airway_scaledcounts.csv", stringsAsFactors = FALSE,row.names = 1)
metadata = read.csv("data/airway_metadata.csv", stringsAsFactors = FALSE)
rmarkdown::paged_table(counts)
rmarkdown::paged_table(metadata)
Subset the metadata into two data frames.
control = metadata[metadata$dex=="control",]
treated = metadata[metadata$dex=="treated",]
rmarkdown::paged_table(control)
rmarkdown::paged_table(treated)
Determine the mean count values for all genes accross control experiments
control.mean = rowSums(counts[,control$id])/length(counts[,control$id])
treated.mean = rowSums(counts[,treated$id])/length(counts[,treated$id])
Let’s store the control.mean and treated.mean together for ease of use
meancounts = data.frame(control.mean, treated.mean)
rmarkdown::paged_table(meancounts)
colSums(meancounts)
## control.mean treated.mean
## 23005324 22196524
Plot the control vs treated.
plot(meancounts[,1:2],xlab="Control",ylab="Treated")
We see a few dozen dots outside of the big clump around the origin. So, let’s try plotting both axes on a log scale.
plot(meancounts[,1:2],log="xy",xlab="log Control",ylab="log Treated")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 15032 x values <= 0
## omitted from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 15281 y values <= 0
## omitted from logarithmic plot
Here we calculate log2foldchange.
meancounts$log2fc = log2(meancounts[,"treated.mean"]/meancounts[,"control.mean"])
We will now remove the NaN and -Inf values. The NaN is returned when you divide by zero and try to take the log. The -Inf is returned when you try to take the log of zero.
zero.vals = which(meancounts[,1:2]==0, arr.ind=TRUE)
to.rm = unique(zero.vals[,1])
meancounts = meancounts[-to.rm,]
rmarkdown::paged_table(meancounts)
How many genes are up in the drug treated cells and how many are down?
up.ind = meancounts$log2fc > 2
down.ind = meancounts$log2fc < (-2)
sum(up.ind)
## [1] 250
sum(down.ind)
## [1] 367
We can add annotation from a supplied CSV file, such as those available from ENSEMBLE or UCSC. The annotables_grch38.csv annotation table links the unambiguous Ensembl gene ID to other useful annotation like the gene symbol, full gene name, location, Entrez gene ID, etc.
anno = read.csv("data/annotables_grch38.csv")
Use the merge funtion to add the annotation data from the “anno” object to our RNA-Seq results in “meancounts”
# use the merge function
meancounts.anno = merge(meancounts, anno, by.x="row.names", by.y="ensgene")
rmarkdown::paged_table(meancounts.anno)
We can use the mapIds() function to add individual columns to our results table.
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
meancounts$symbol <- mapIds(org.Hs.eg.db,
keys=row.names(meancounts),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
meancounts$entrez <- mapIds(org.Hs.eg.db,
keys=row.names(meancounts),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
meancounts$uniprot <- mapIds(org.Hs.eg.db,
keys=row.names(meancounts),
column="UNIPROT",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
rmarkdown::paged_table(meancounts[up.ind,])
rmarkdown::paged_table(meancounts[down.ind,])
DESeq is Differential gene expression analysis based on the negative binomial distribution. Setup the object needed for DESeq analysis
counts <- read.csv("data/airway_scaledcounts.csv", stringsAsFactors = FALSE)
dds = DESeqDataSetFromMatrix(countData=counts,
colData=metadata,
design=~dex,
tidy=TRUE)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res = results(dds)
summary(res)
##
## out of 25258 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1563, 6.2%
## LFC < 0 (down) : 1188, 4.7%
## outliers [1] : 142, 0.56%
## low counts [2] : 9971, 39%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res = res[order(res$pvalue),]
rmarkdown::paged_table(as.data.frame(res))
write.csv(res, file="data/deseq_results.csv")
Change the p-value threshold again to 0.01
res01 = results(dds, alpha=0.01)
summary(res01)
##
## out of 25258 with nonzero total read count
## adjusted p-value < 0.01
## LFC > 0 (up) : 850, 3.4%
## LFC < 0 (down) : 581, 2.3%
## outliers [1] : 142, 0.56%
## low counts [2] : 9033, 36%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Store our results as a data.frame object, sort or order our results by the adjusted p-value
resSig01 = subset(as.data.frame(res), padj < 0.01)
rmarkdown::paged_table(resSig01[order(resSig01$padj),])
Let’s first see what the gene ID is for the CRISPLD2 gene using:
resSig01[grep("CRISPLD2", resSig01$symbol),]
## [1] baseMean log2FoldChange lfcSE stat
## [5] pvalue padj
## <0 rows> (or 0-length row.names)
rownames(resSig01[grep("CRISPLD2", resSig01$symbol),])
## character(0)
We can mow use this returned object to plot a boxplot with the base graphics function
d = plotCounts(dds, gene="ENSG00000103196", intgroup="dex", returnData=TRUE)
head(d)
## count dex
## SRR1039508 774.5002 control
## SRR1039509 6258.7915 treated
## SRR1039512 1100.2741 control
## SRR1039513 6093.0324 treated
## SRR1039516 736.9483 control
## SRR1039517 2742.1908 treated
boxplot(count ~ dex , data=d)
Or use ggplot2
ggplot(d, aes(dex, count)) +
geom_boxplot(aes(fill=dex)) +
scale_y_log10() +
ggtitle("CRISPLD2")
Volcano plot
# Setup our custom point color vector
mycols = rep("gray", nrow(res))
mycols[ res$padj < 0.01 ] <- "red"
mycols[ abs(res$log2FoldChange) > 2 ] <- "blue"
mycols[ (res$padj < 0.01) & (abs(res$log2FoldChange) > 2 ) ] <- "green"
palette( c("gray","blue") )
plot( res$log2FoldChange, -log(res$padj), col=mycols, ylab="-Log(P-value)", xlab="Log2(FoldChange)")
# Add some cut-off lines
abline(v=c(-2,2), col="darkgray", lty=2)
abline(h=-log(0.1), col="darkgray", lty=2)
Or use ggplot2
ggplot(as.data.frame(res), aes(log2FoldChange, -log10(pvalue), col=mycols)) +
geom_point() +
ggtitle("Volcano plot")
## Warning: Removed 13578 rows containing missing values (geom_point).